In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import plotly.express as px
In [2]:
data = pd.read_csv(r"C:\Users\Iddrisu Bachokun\Desktop\Python\credit_risk_analysis\credit_risk.csv")
data.head()
Out[2]:
Id Age Income Home Emp_length Intent Amount Rate Status Percent_income Default Cred_length
0 0 22 59000 RENT 123.0 PERSONAL 35000 16.02 1 0.59 Y 3
1 1 21 9600 OWN 5.0 EDUCATION 1000 11.14 0 0.10 N 2
2 2 25 9600 MORTGAGE 1.0 MEDICAL 5500 12.87 1 0.57 N 3
3 3 23 65500 RENT 4.0 MEDICAL 35000 15.23 1 0.53 N 2
4 4 24 54400 RENT 8.0 MEDICAL 35000 14.27 1 0.55 Y 4
In [3]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              32581 non-null  int64  
 1   Age             32581 non-null  int64  
 2   Income          32581 non-null  int64  
 3   Home            32581 non-null  object 
 4   Emp_length      31686 non-null  float64
 5   Intent          32581 non-null  object 
 6   Amount          32581 non-null  int64  
 7   Rate            29465 non-null  float64
 8   Status          32581 non-null  int64  
 9   Percent_income  32581 non-null  float64
 10  Default         32581 non-null  object 
 11  Cred_length     32581 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 3.0+ MB
In [4]:
data.describe()
Out[4]:
Id Age Income Emp_length Amount Rate Status Percent_income Cred_length
count 32581.000000 32581.000000 3.258100e+04 31686.000000 32581.000000 29465.000000 32581.000000 32581.000000 32581.000000
mean 16290.006139 27.734600 6.607485e+04 4.789686 9589.371106 11.011695 0.218164 0.170203 5.804211
std 9405.479594 6.348078 6.198312e+04 4.142630 6322.086646 3.240459 0.413006 0.106782 4.055001
min 0.000000 20.000000 4.000000e+03 0.000000 500.000000 5.420000 0.000000 0.000000 2.000000
25% 8145.000000 23.000000 3.850000e+04 2.000000 5000.000000 7.900000 0.000000 0.090000 3.000000
50% 16290.000000 26.000000 5.500000e+04 4.000000 8000.000000 10.990000 0.000000 0.150000 4.000000
75% 24435.000000 30.000000 7.920000e+04 7.000000 12200.000000 13.470000 0.000000 0.230000 8.000000
max 32780.000000 144.000000 6.000000e+06 123.000000 35000.000000 23.220000 1.000000 0.830000 30.000000
In [5]:
data.isna().sum()
Out[5]:
Id                   0
Age                  0
Income               0
Home                 0
Emp_length         895
Intent               0
Amount               0
Rate              3116
Status               0
Percent_income       0
Default              0
Cred_length          0
dtype: int64
In [6]:
data['Default'].unique()
Out[6]:
array(['Y', 'N'], dtype=object)
In [7]:
data['Cred_length'].unique()
Out[7]:
array([ 3,  2,  4,  8,  7,  6,  9, 10,  5, 11, 16, 15, 12, 13, 17, 14, 25,
       28, 27, 22, 19, 29, 23, 26, 20, 21, 30, 24, 18], dtype=int64)
In [8]:
data['Percent_income'].unique()
Out[8]:
array([0.59, 0.1 , 0.57, 0.53, 0.55, 0.25, 0.45, 0.44, 0.42, 0.16, 0.41,
       0.37, 0.32, 0.3 , 0.06, 0.29, 0.31, 0.22, 0.52, 0.14, 0.49, 0.13,
       0.5 , 0.35, 0.17, 0.27, 0.33, 0.08, 0.03, 0.21, 0.63, 0.47, 0.4 ,
       0.07, 0.38, 0.34, 0.04, 0.23, 0.15, 0.11, 0.43, 0.51, 0.02, 0.28,
       0.26, 0.19, 0.39, 0.09, 0.05, 0.61, 0.18, 0.6 , 0.01, 0.48, 0.12,
       0.54, 0.56, 0.46, 0.36, 0.24, 0.2 , 0.72, 0.64, 0.69, 0.77, 0.83,
       0.65, 0.67, 0.58, 0.71, 0.68, 0.7 , 0.66, 0.  , 0.76, 0.62, 0.78])
In [9]:
data['Status'].unique()
Out[9]:
array([1, 0], dtype=int64)
In [ ]:
 
In [10]:
data['Rate'].isna().sum()
Out[10]:
3116
In [11]:
data['Amount'].isna().sum()
Out[11]:
0
In [12]:
data['Intent'].unique()
Out[12]:
array(['PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT',
       'DEBTCONSOLIDATION'], dtype=object)
In [13]:
data['Emp_length'].isna().sum()
Out[13]:
895
In [ ]:
 
In [14]:
df = data[["Age","Income","Home","Emp_length","Intent","Amount","Rate","Status","Percent_income","Default","Cred_length"]].dropna()
df["Deft"] = df.Default # reindexing to chane tthe posiion to the last
df.head()
Out[14]:
Age Income Home Emp_length Intent Amount Rate Status Percent_income Default Cred_length Deft
0 22 59000 RENT 123.0 PERSONAL 35000 16.02 1 0.59 Y 3 Y
1 21 9600 OWN 5.0 EDUCATION 1000 11.14 0 0.10 N 2 N
2 25 9600 MORTGAGE 1.0 MEDICAL 5500 12.87 1 0.57 N 3 N
3 23 65500 RENT 4.0 MEDICAL 35000 15.23 1 0.53 N 2 N
4 24 54400 RENT 8.0 MEDICAL 35000 14.27 1 0.55 Y 4 Y
In [15]:
df = df[["Age","Income","Home","Emp_length","Intent","Amount","Rate","Status","Percent_income","Cred_length","Deft"]]
df.head()
Out[15]:
Age Income Home Emp_length Intent Amount Rate Status Percent_income Cred_length Deft
0 22 59000 RENT 123.0 PERSONAL 35000 16.02 1 0.59 3 Y
1 21 9600 OWN 5.0 EDUCATION 1000 11.14 0 0.10 2 N
2 25 9600 MORTGAGE 1.0 MEDICAL 5500 12.87 1 0.57 3 N
3 23 65500 RENT 4.0 MEDICAL 35000 15.23 1 0.53 2 N
4 24 54400 RENT 8.0 MEDICAL 35000 14.27 1 0.55 4 Y
In [16]:
df.isna().sum()
Out[16]:
Age               0
Income            0
Home              0
Emp_length        0
Intent            0
Amount            0
Rate              0
Status            0
Percent_income    0
Cred_length       0
Deft              0
dtype: int64
In [17]:
for label in df.columns[:-1]:
    plt.hist(df[df['Deft']=='Y'][label],color = "red",label="Defaulted",alpha=.7,density=True)
    plt.hist(df[df['Deft']=='N'][label],color = "green",label="Defaulted",alpha=0.7,density=True)
    plt.title(label)
    plt.xlabel(label)
    plt.ylabel("probability")
    plt.legend()
    plt.show()

Box plot visualisation¶

In [18]:
fig = px.box(df, x= "Deft",
            color = "Deft",
            y ="Age",
            title = "Loan Default status based on Age",
            
            color_discrete_map ={"Y":"red",
                                "N":'green'})

fig.update_traces(quartilemethod='exclusive')
In [19]:
fig = px.box(df, x= "Deft",
            color = "Deft",
            y ="Income",
            title = "Loan Default status based on Income",
            
            color_discrete_map ={"Y":"red",
                                "N":'green'})

fig.update_traces(quartilemethod='exclusive')
In [20]:
fig = px.box(df, x= "Deft",
            color = "Deft",
            y ="Home",
            title = "Loan Default status based on Home",
            
            color_discrete_map ={"Y":"red",
                                "N":'green'})

fig.update_traces(quartilemethod='exclusive')
In [21]:
fig = px.box(df, x= "Deft",
            color = "Deft",
            y ="Emp_length",
            title = "Loan Default status based on Employment",
            
            color_discrete_map ={"Y":"red",
                                "N":'green'})

fig.update_traces(quartilemethod='exclusive')
fig

Pie Chart¶

In [22]:
home= df["Home"].value_counts()
transactions = home.index
quantity = home.values
fig = px.pie(df,
            values = quantity,
            names = transactions,
            title = "Debter's type of home")
fig.show()
In [23]:
emp= df["Emp_length"].value_counts()
transactions = emp.index
quantity = emp.values
fig = px.pie(df,
            values = quantity,
            names = transactions,
            title = "Debter's type of Employment status")
fig.show()
In [24]:
intent= df["Intent"].value_counts()
transactions = intent.index
quantity = intent.values
fig = px.pie(df,
            values = quantity,
            names = transactions,
            title = "Debter's type of intent")
fig.show()
In [25]:
status= df["Status"].value_counts()
transactions = status.index
quantity = status.values
fig = px.pie(df,
            values = quantity,
            names = transactions,
            title = "Debter's type of Status")
fig.show()

Cnverting the string into number for understaning of the computer¶

In [26]:
df["Deft"]=df["Deft"].map({"Y":1, 'N':0})

df["Home"] = df["Home"].map({"RENT":1,"MORTGAGE":2,"OWN":3,"OTHER":4})
df["Intent"] =df["Intent"].map({'PERSONAL':1,'EDUCATION':2,'MEDICAL':3,'VENTURE':4,'HOMEIMPROVEMENT':5,'DEBTCONSOLIDATION':6})
df.head()
Out[26]:
Age Income Home Emp_length Intent Amount Rate Status Percent_income Cred_length Deft
0 22 59000 1 123.0 1 35000 16.02 1 0.59 3 1
1 21 9600 3 5.0 2 1000 11.14 0 0.10 2 0
2 25 9600 2 1.0 3 5500 12.87 1 0.57 3 0
3 23 65500 1 4.0 3 35000 15.23 1 0.53 2 0
4 24 54400 1 8.0 3 35000 14.27 1 0.55 4 1

Training the Data¶

In [ ]:
 
In [29]:
from sklearn.preprocessing import StandardScaler
In [30]:
train,test,valid = np.split(df.sample(frac=1),[int(0.6*len(df)),int(0.8*len(df))])
In [31]:
def scle_dataset(dataframe):
    
    x = dataframe[dataframe.cols[:-1]].values
    y = dataframe[dataframe.col[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    
    
    if oversample:
        ros = RandomOverSampler()
        x ,y = ros.fit_resample(x,y)
        data =np.hsatck((x,np.resahpe(y,(-1,1))))
        
        return data ,x,y
In [32]:
print(len(train[train["Deft"]==0]))
14162
In [33]:
print(len(train[train["Deft"]==1]))
3020

Oversampling¶

In [34]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
In [35]:
def scale_dataset(dataframe,oversample=False):
    
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    
    
    if oversample:
        ros = RandomOverSampler()
        x ,y = ros.fit_resample(x,y)
    data =np.hstack((x,np.reshape(y,(-1,1))))
        
    return data ,x,y
In [36]:
train, xtrain, ytrain = scale_dataset(train, oversample=True)
valid, xvalid ,yvalid = scale_dataset(valid, oversample=False)
test, xtest, ytest = scale_dataset(test, oversample=False)
In [40]:
print(len(ytrain==0))
28324
In [41]:
print(len(ytrain==1))
28324

we now see that the data has the same length. We can now go ahead to prediction with the models¶

In [47]:
from sklearn.metrics import classification_report
In [43]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(xtrain,ytrain)
Out[43]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [44]:
y_pred = svm_model.predict(xtest)
In [49]:
print(classification_report(ytest,y_pred))
              precision    recall  f1-score   support

           0       0.98      0.76      0.86      4688
           1       0.46      0.93      0.62      1040

    accuracy                           0.79      5728
   macro avg       0.72      0.84      0.74      5728
weighted avg       0.89      0.79      0.81      5728

we want to use our model to predict some credit risk data¶

In [51]:
input_data = (23,65500,1,4.0,3,35000,15.23,1,0.53,2)
input_data_np = np.asarray(input_data)
imput_data_re = input_data_np.reshape(1,-1)

pred = svm_model.predict(imput_data_re)
print(pred)
if(pred[0]==0):
    print("The debtor will not default payment")
    
else:
    print("The debtor will default payment")
[0]
The debtor will not default payment

we see that our model is predicting corectly as expected

In [52]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(xtrain,ytrain)
Out[52]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [54]:
y_pred = knn_model.predict(xtest)
In [55]:
print(classification_report(ytest,y_pred))
              precision    recall  f1-score   support

           0       0.93      0.77      0.84      4688
           1       0.41      0.73      0.52      1040

    accuracy                           0.76      5728
   macro avg       0.67      0.75      0.68      5728
weighted avg       0.83      0.76      0.78      5728

In [58]:
input_data = (24,54400,1,8.0,3,35000,14.27,1,0.55,4)
input_data_np = np.asarray(input_data)
imput_data_re = input_data_np.reshape(1,-1)

pred = knn_model.predict(imput_data_re)
print(pred)
if(pred[0]==0):
    print("The debtor will not default payment")
    
else:
    print("The debtor will default payment")
[1]
The debtor will default payment
In [ ]: